How many price requests could be cached (i.e. share the same pickup and return location, source country and driver age) within 1 hour, 3 hours, or 12 hours? Per contractor.
The graphs show, at full hours (e.g. every 3 hours, every 12 hours), what portion of the requests had duplicated parameters (per broker_contract_id) in the last 1, 3, or 12 hours. After the data has been computed per broker_contract_id, it is aggregated into per-broker totals.
Note that for these graphs we have already removed duplicate request_uuid values.
Currently some graphs look odd because we only evaluate at every 1-hour, 3-hour or 12-hour boundary.
In the second case we additionally require return_timestamp and pickup_timestamp to be the same.
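As a minimal sketch of the idea (a hypothetical toy DataFrame; only the column names are taken from the rate_quote.csv fields used below), the cacheable proportion within one time window is simply the share of rows whose key columns repeat an earlier row:

import pandas as pd

# Hypothetical toy data; the column names mirror the rate_quote.csv fields used below.
toy = pd.DataFrame({
    "pickup_desk_id":           [1, 1, 2, 1],
    "return_desk_id":           [5, 5, 6, 5],
    "source_country_region_id": [10, 10, 10, 10],
    "driver_age":               [30, 30, 45, 30],
})
key = ["pickup_desk_id", "return_desk_id", "source_country_region_id", "driver_age"]
# drop_duplicates keeps the first occurrence; every later repeat could have been served from a cache.
cacheable = len(toy) - len(toy.drop_duplicates(key))
print(cacheable / len(toy))  # 0.5: the 2nd and 4th rows repeat the 1st row's parameters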
From the request data, see how many responses are exactly the same.
To start we'll investigate the "charge_log" column.
import pandas as pd
import csv
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (14, 6)
%load_ext autoreload
%autoreload 2
fields = ["pickup_timestamp", "timestamp",
"source_country_region_id", "driver_age", "rental_days",
"broker_contract_id", "pickup_desk_id", "return_desk_id"]
df = pd.read_csv("rate_quote.csv", skipinitialspace=True, usecols=fields)
start_size = len(df)
print(start_size)
grouped = df.groupby("charge_log")
with open("test.op", "w") as f:
i = 0
for group in grouped:
for g in group[1].itertuples():
f.write(str(g) +"\n")
i += 1
f.write("\n")
if i == 30:
break
# Keeps first of each duplicate
dropped = df.drop_duplicates("charge_log")
len(dropped)
cached_percentage = 100 - len(dropped)/start_size*100
print("Percentage of duplicate responses is {}".format(cached_percentage))
1) We can make a line graph again with the y-values being the % of requests that share the same location, source country and driver age within a 1-hour frame. For all hours? 5-minute intervals? Over a day.
(Same pickup and return location, same source country and driver age.)
Question: does it matter for which day the price requests are made, i.e. do pickup_timestamp and return_timestamp have to match as well?
This should be done on the unfiltered data.
fields = ["timestamp", "driver_age", "source_country_region_id", "pickup_desk_id", "return_desk_id", "return_timestamp", "pickup_timestamp"]
df_new = pd.read_csv("rate_quote.csv", skipinitialspace=True, usecols=fields)
df_new["timestamp"] = pd.to_datetime(df_new['timestamp'])
df_new["pickup_timestamp"] = pd.to_datetime(df_new['pickup_timestamp'])
df_new["return_timestamp"] = pd.to_datetime(df_new['return_timestamp'])
df_new = df_new.set_index("timestamp")
df_new_groups = df_new.groupby(pd.Grouper(freq='60Min'))
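With df_new indexed by timestamp, the 5-minute-interval idea raised above can be sketched directly; this is only an illustration and assumes the same key columns used in the hourly analysis below:

# Sketch: proportion of repeated-parameter requests per 5-minute window.
key = ["pickup_desk_id", "return_desk_id", "source_country_region_id", "driver_age"]
per_5min = df_new.groupby(pd.Grouper(freq="5Min")).apply(
    lambda w: 0.0 if len(w) == 0 else (len(w) - len(w.drop_duplicates(key))) / len(w))
per_5min.plot(title="Proportion of repeated-parameter requests per 5-minute window")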
fields = ["timestamp", "pickup_timestamp", "return_timestamp", "broker_contract_id",
"source_country_region_id", "driver_age", "pickup_desk_id", "return_desk_id", "rental_days",
"Broker_name"]
df_dups = pd.read_csv("rate_quote_1_dup2.csv", skipinitialspace=True, usecols=fields)
print(df_dups.shape)
df_dups["timestamp"] = pd.to_datetime(df_dups['timestamp'])
df_dups["pickup_timestamp"] = pd.to_datetime(df_dups['pickup_timestamp'])
df_dups["return_timestamp"] = pd.to_datetime(df_dups['return_timestamp'])
df_new_dups = df_dups.set_index("timestamp")
df_new_dups = df_new_dups.groupby(pd.Grouper(freq='60Min'))
We consider three caching scenarios (a small helper sketch follows this list):
1) same source country, pickup location, return location and driver age
2) same source country, pickup location, return location and driver age, plus the same pickup_timestamp and return_timestamp
3) same pickup day, driver age, pickup location, drop-off location and rental length in days -- all within the same broker_contract_id.
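As a hedged sketch (the scenario names below are ours, but the column subsets are exactly the ones used in the deduplication calls further down), the three scenarios differ only in which columns form the deduplication key:

# Columns that define a "same request" under each caching scenario (names are illustrative).
SCENARIO_KEYS = {
    "1_location_and_age":  ["driver_age", "source_country_region_id",
                            "pickup_desk_id", "return_desk_id"],
    "2_exact_dates":       ["driver_age", "source_country_region_id",
                            "pickup_desk_id", "return_desk_id",
                            "pickup_timestamp", "return_timestamp"],
    "3_pickup_day_length": ["driver_age", "source_country_region_id",
                            "pickup_desk_id", "return_desk_id",
                            "pickup_timestamp", "rental_days"],  # pickup_timestamp normalized to a day first
}

def cacheable_count(frame, scenario):
    """Requests in `frame` that repeat an earlier request's parameters under the given scenario."""
    key = SCENARIO_KEYS[scenario]
    return len(frame) - len(frame.drop_duplicates(key))

Every per-window loop below is an instance of this: group the window by broker_contract_id, drop duplicates on the scenario's key, and count the removed rows as cacheable.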
from collections import defaultdict
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id", "pickup_desk_id", "return_desk_id"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
print(total)
print(sum(by_hour))
proportion = [by_hour[i]/total_by_hour[i] for i in range(len(by_hour))]
print(proportion)
from pprint import pprint
# Merge with broker contract name
fields = ["Contract_ID", "Contract_name", "Broker name"]
contract_df = pd.read_csv("broker_contracts.csv", skipinitialspace=True, usecols=fields)
# This holds contract names and which broker name corresponds to each contract id..
#contract_df = contract_df.set_index("Contract_ID")
print(contract_df.shape)
print(contract_df.dtypes)
from collections import defaultdict
broker_contract_ids = defaultdict(set)
for row in contract_df.itertuples():
    #print(row)
    broker_contract_ids[row[3]].add(row[1])  # row[1] = Contract_ID, row[3] = Broker name
print(broker_contract_ids)
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-hour counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours", kind="line", colormap="gist_gray")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
# 3 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2 = df_new_dups_2.groupby(pd.Grouper(freq='180Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_2:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id", "pickup_desk_id", "return_desk_id"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours (by 3 hours)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
print(total)
print(sum(by_hour))
from pprint import pprint
pprint(by_contract_id.keys())
pprint(by_contract_id[21])
# 12 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2 = df_new_dups_2.groupby(pd.Grouper(freq='720Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_2:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id", "pickup_desk_id", "return_desk_id"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours (by 12 hours)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
In this case we additionally require pickup_timestamp and return_timestamp to be the same.
#2) source, pickup, return, driver age, same pickup_timestamp, same return_timestamp
# 1 hour
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2 = df_new_dups_2.groupby(pd.Grouper(freq='60Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_2:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "return_timestamp"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours with same pickup and return dates (by 1 hour interval)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
#2) source, pickup, return, driver age, same pickup_timestamp, same return_timestamp
# 3 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2 = df_new_dups_2.groupby(pd.Grouper(freq='180Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_2:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "return_timestamp"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours with same pickup and return dates (by 3 hour interval)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
#2) source, pickup, return, driver age, same pickup_timestamp, same return_timestamp
# 12 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2 = df_new_dups_2.groupby(pd.Grouper(freq='720Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_2:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "return_timestamp"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours with same pickup and return dates (by 12 hour interval)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
Same pickup location, same drop-off location, same pickup day, same rental duration.
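A one-line illustration of what "same pickup day" means here (dt.normalize drops the time-of-day component, so two requests for different hours of the same day compare equal; the timestamps are made up):

ts = pd.to_datetime(pd.Series(["2017-12-02 09:30", "2017-12-02 18:45"]))
print(ts.dt.normalize().nunique())  # 1: both normalize to 2017-12-02 00:00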
#3) source, pickup, return, driver age, same pickup day, same rental length in days
# 1 hour
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2["pickup_timestamp"] = df_new_dups_2["pickup_timestamp"].dt.normalize()
df_new_dups_grouped = df_new_dups_2.groupby(pd.Grouper(freq='60Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_grouped:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "rental_days"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data[contractor]:
                merged_data[contractor][k[0]][0] += k[1]
                merged_data[contractor][k[0]][1] += k[2]
            else:
                merged_data[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data[contractor]).T
    if merged_data[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours (caching 1 hour)", kind="line")
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
#3) source, pickup, return, driver age, same pickup day, same rental length in days
# 3 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2["pickup_timestamp"] = df_new_dups_2["pickup_timestamp"].dt.normalize()
#print(df_new_dups_2)
df_new_dups_grouped = df_new_dups_2.groupby(pd.Grouper(freq='180Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_grouped:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "rental_days"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data_2 = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data_2[contractor]:
                merged_data_2[contractor][k[0]][0] += k[1]
                merged_data_2[contractor][k[0]][1] += k[2]
            else:
                merged_data_2[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data_2[contractor]).T
    if merged_data_2[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours (caching 3 hours)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
#3) source, pickup, return, driver age, same pickup day, same rental length in days
# 12 hours
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2["pickup_timestamp"] = df_new_dups_2["pickup_timestamp"].dt.normalize()
#print(df_new_dups_2)
df_new_dups_grouped = df_new_dups_2.groupby(pd.Grouper(freq='720Min'))
by_contract_id = defaultdict(list)
total = 0
by_hour = []
total_by_hour = []
for group in df_new_dups_grouped:
    # Group by broker_contract_id
    in_group_cache_total = 0
    for g in group[1].groupby(["broker_contract_id"]):
        non_dup = g[1].drop_duplicates(["driver_age", "source_country_region_id",
                                        "pickup_desk_id", "return_desk_id", "pickup_timestamp", "rental_days"])
        in_group_cache = len(g[1]) - len(non_dup)
        in_group_cache_total += in_group_cache
        by_contract_id[g[0]].append((group[0], in_group_cache, len(g[1])))
    by_hour.append(in_group_cache_total)
    total_by_hour.append(len(group[1]))
    total += len(group[1])
merged_data_3 = defaultdict(dict)
for contractor in broker_contract_ids.keys():
    for key in broker_contract_ids[contractor]:
        # Sum the per-window counts of every contract id belonging to this broker
        for k in by_contract_id[key]:
            if k[0] in merged_data_3[contractor]:
                merged_data_3[contractor][k[0]][0] += k[1]
                merged_data_3[contractor][k[0]][1] += k[2]
            else:
                merged_data_3[contractor][k[0]] = [k[1], k[2]]
for contractor in broker_contract_ids.keys():
    tJig = pd.DataFrame(merged_data_3[contractor]).T
    if merged_data_3[contractor]:
        tJig["proportion"] = tJig[0] / tJig[1]
        tPlot = tJig.plot(y="proportion", title=contractor + " proportion of same parameter requests on full hours (caching 12 hours)", kind="line")#, width=0.75)
        tPlot.set_ylim(0, 1.1)
        tPlot.set_xlim(pd.to_datetime("2017-12-01"), pd.to_datetime("2017-12-04"))
#3) source, pickup, return, driver age, same pickup day, same rental length in days
# Instead of aligned full-hour bins, evaluate every 30 minutes and look back frq minutes
# (60, 180 or 720) to count how many requests repeat parameters already seen in that window.
df_new_dups_2 = df_dups.set_index("timestamp")
df_new_dups_2["pickup_timestamp"] = df_new_dups_2["pickup_timestamp"].dt.normalize()
merged_datas = []
for frq in [60, 180, 720]:
    merged_data_3 = defaultdict(dict)
    for contractor in broker_contract_ids.keys():
        df_contractor = df_new_dups_2[df_new_dups_2["Broker_name"] == contractor]
        if not df_contractor.empty:
            df_contractor = df_contractor.sort_index()
            date_range = pd.date_range(df_contractor.index.min(), df_contractor.index.max(), freq="30Min")
            cum_total = 0
            for i in range(len(date_range) - 1):
                between_dates = df_contractor[date_range[i+1] - pd.Timedelta(minutes=frq):date_range[i+1]]
                total = len(between_dates)
                non_dup_count = len(between_dates.drop_duplicates(["driver_age", "source_country_region_id", "broker_contract_id",
                                                                   "pickup_desk_id", "return_desk_id", "pickup_timestamp", "rental_days"]))
                cached = total - non_dup_count
                proportion = 0 if total == 0 else cached / total
                cum_total += total
                merged_data_3[contractor][date_range[i+1]] = [cached, total, proportion, cum_total]
    merged_datas.append(merged_data_3)
#import plotly.plotly as py
#import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
init_notebook_mode()
for broker in broker_contract_ids.keys():
    tJigs = []
    for merged_data in merged_datas:
        tJig = pd.DataFrame(merged_data[broker]).T
        if tJig.empty:
            continue
        tJig["proportion"] = tJig[2]
        tJigs.append(tJig)
    layout = go.Layout(
        title=broker + " proportion of same parameter requests",
        titlefont=dict(
            family='Courier New, monospace'
        ),
        xaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            title="Date",
        ),
        yaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            range=[0, 1],
            title="Proportion of cacheable requests"
        )
    )
    data = []
    try:
        data = [go.Scatter(x=tJigs[0].index, y=tJigs[0].proportion, name="Caching 1 hour"),
                go.Scatter(x=tJigs[1].index, y=tJigs[1].proportion, name="Caching 3 hours"),
                go.Scatter(x=tJigs[2].index, y=tJigs[2].proportion, name="Caching 12 hours")]
    except IndexError:
        # Fewer than three caching windows had data for this broker
        pass
    if data:
        fig = go.Figure(data=data, layout=layout)
        iplot(fig)
for broker in broker_contract_ids.keys():
    tJigs = []
    for merged_data in merged_datas:
        tJig = pd.DataFrame(merged_data[broker]).T
        if tJig.empty:
            continue
        tJigs.append(tJig)
    layout = go.Layout(
        title=broker + " absolute number of same requests",
        titlefont=dict(
            family='Courier New, monospace'
        ),
        xaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            title="Date",
        ),
        yaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            #range=[0, 1],
            title="Absolute count of cacheable requests"
        )
    )
    data = []
    try:
        data = [go.Scatter(x=tJigs[0].index, y=tJigs[0][0], name="Caching 1 hour"),
                go.Scatter(x=tJigs[1].index, y=tJigs[1][0], name="Caching 3 hours"),
                go.Scatter(x=tJigs[2].index, y=tJigs[2][0], name="Caching 12 hours")]
    except IndexError:
        # Fewer than three caching windows had data for this broker
        pass
    if data:
        fig = go.Figure(data=data, layout=layout)
        iplot(fig)
fields = ["Contract_ID", "Contract_name", "Broker name"]
contract_df = pd.read_csv("broker_contracts.csv", skipinitialspace=True, usecols=fields)
contract_df["Broker name"] = contract_df["Broker name"].astype(str)
contract_df["Contract_name"] = contract_df["Contract_name"].astype(str)
# This holds contract names and which broker name corresponds to each contract id..
#contract_df = contract_df.set_index("Contract_ID")
print(contract_df.shape)
print(contract_df.dtypes)
# What different contractors are there?
contractors = set(contract_df["Broker name"])
print("Different contractors:", contractors)
# For broker, find which contract ids it has
from collections import defaultdict
broker_contract_ids = defaultdict(set)
for row in contract_df.itertuples():
    broker_contract_ids[row[3]].add(row[1])  # row[1] = Contract_ID, row[3] = Broker name
contract_df.columns = ["broker_contract_id", "Contract_name", "Broker_name"]
#print(df)
merged_requests = pd.merge(df, contract_df, on=["broker_contract_id"])
print(merged_requests.dtypes)
# Create new, merged table, with contractor name in each.
#3) source, pickup, return, driver age, same pickup day, same rental length in days
merged_requests["timestamp"] = pd.to_datetime(merged_requests['timestamp'])
merged_requests["pickup_timestamp"] = pd.to_datetime(merged_requests['pickup_timestamp'])
df_new_dups_2 = merged_requests.set_index("timestamp")
df_new_dups_2["pickup_timestamp"] = df_new_dups_2["pickup_timestamp"].dt.normalize()
merged_datas = []
for frq in [60, 180, 720]:
    merged_data_3 = defaultdict(dict)
    for contractor in broker_contract_ids.keys():
        df_contractor = df_new_dups_2[df_new_dups_2["Broker_name"] == contractor]
        if not df_contractor.empty:
            df_contractor = df_contractor.sort_index()
            date_range = pd.date_range(df_contractor.index.min(), df_contractor.index.max(), freq="30Min")
            cum_total = 0
            for i in range(len(date_range) - 1):
                between_dates = df_contractor[date_range[i+1] - pd.Timedelta(minutes=frq):date_range[i+1]]
                total = len(between_dates)
                non_dup_count = len(between_dates.drop_duplicates(["driver_age", "source_country_region_id", "broker_contract_id",
                                                                   "pickup_desk_id", "return_desk_id", "pickup_timestamp", "rental_days"]))
                cached = total - non_dup_count
                proportion = 0 if total == 0 else cached / total
                cum_total += total
                merged_data_3[contractor][date_range[i+1]] = [cached, total, proportion, cum_total]
    merged_datas.append(merged_data_3)
for broker in broker_contract_ids.keys():
    tJigs = []
    for merged_data in merged_datas:
        tJig = pd.DataFrame(merged_data[broker]).T
        if tJig.empty:
            continue
        tJigs.append(tJig)
    layout = go.Layout(
        title=broker + " absolute number of same requests",
        titlefont=dict(
            family='Courier New, monospace'
        ),
        xaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            title="Date",
        ),
        yaxis=dict(
            showgrid=True,
            zeroline=True,
            showline=True,
            #ticks='',
            showticklabels=True,
            #range=[0, 1],
            title="Absolute count of cacheable requests"
        )
    )
    data = []
    try:
        data = [go.Scatter(x=tJigs[0].index, y=tJigs[0][0], name="Caching 1 hour"),
                go.Scatter(x=tJigs[1].index, y=tJigs[1][0], name="Caching 3 hours"),
                go.Scatter(x=tJigs[2].index, y=tJigs[2][0], name="Caching 12 hours")]
    except IndexError:
        # Fewer than three caching windows had data for this broker
        pass
    if data:
        fig = go.Figure(data=data, layout=layout)
        iplot(fig)